Podaci

# Load the three data sets used in this report: real-estate sales
# (tab-delimited text), dry beans and credit cards (Excel workbooks).
data1 <- read.delim("realestate.txt")
data2 <- read_excel("Dry_Bean_Dataset.xlsx")
# Class is the classification target, so it is stored as a factor.
data2 <- mutate(data2, Class = factor(Class))
data3 <- read_excel("kreditnekartice.xlsx")

# Linear models for SalesPrice: the full model on every other column and
# two reduced models on hand-picked predictor subsets.
realestate <- lm(SalesPrice ~ ., data = data1)
realestate2 <- lm(SalesPrice ~ SqFeet + Quality + Baths, data = data1)
realestate3 <- lm(SalesPrice ~ Beds + Garage + Year, data = data1)

# Per-observation fitted values and residuals of the full model.
realestate_aug <- augment(realestate)

Multipla regresija

Deskriptivna analiza

# Five-number summary and mean of every column in the real-estate data.
summary(data1)
##    SalesPrice        SqFeet           Beds           Baths      
##  Min.   : 84.0   Min.   :0.980   Min.   :1.000   Min.   :1.000  
##  1st Qu.:180.0   1st Qu.:1.701   1st Qu.:3.000   1st Qu.:2.000  
##  Median :229.9   Median :2.061   Median :3.000   Median :3.000  
##  Mean   :277.4   Mean   :2.261   Mean   :3.478   Mean   :2.647  
##  3rd Qu.:335.0   3rd Qu.:2.638   3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :920.0   Max.   :5.032   Max.   :7.000   Max.   :7.000  
##     AirCond           Garage           Pool             Year     
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :1885  
##  1st Qu.:1.0000   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.:1956  
##  Median :1.0000   Median :2.000   Median :0.0000   Median :1966  
##  Mean   :0.8311   Mean   :2.098   Mean   :0.0691   Mean   :1967  
##  3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:0.0000   3rd Qu.:1981  
##  Max.   :1.0000   Max.   :7.000   Max.   :1.0000   Max.   :1998  
##     Quality          Style             Lot           Highway       
##  Min.   :1.000   Min.   : 1.000   Min.   : 4.56   Min.   :0.00000  
##  1st Qu.:2.000   1st Qu.: 1.000   1st Qu.:17.16   1st Qu.:0.00000  
##  Median :2.000   Median : 2.000   Median :22.20   Median :0.00000  
##  Mean   :2.186   Mean   : 3.349   Mean   :24.34   Mean   :0.02111  
##  3rd Qu.:3.000   3rd Qu.: 7.000   3rd Qu.:26.78   3rd Qu.:0.00000  
##  Max.   :3.000   Max.   :11.000   Max.   :86.83   Max.   :1.00000
# Coefficient table, residual summary and fit statistics of the full model.
summary(realestate)
## 
## Call:
## lm(formula = SalesPrice ~ ., data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -186.25  -37.55   -2.36   31.70  293.79 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2351.8910   433.9996  -5.419 9.26e-08 ***
## SqFeet        129.7452     7.6675  16.921  < 2e-16 ***
## Beds           -8.2357     3.5350  -2.330   0.0202 *  
## Baths           4.7172     4.6687   1.010   0.3128    
## AirCond       -13.5533     8.5834  -1.579   0.1150    
## Garage         13.5320     5.4562   2.480   0.0135 *  
## Pool            8.8900    11.2356   0.791   0.4292    
## Year            1.2410     0.2188   5.671 2.38e-08 ***
## Quality       -46.6914     7.4423  -6.274 7.54e-10 ***
## Style          -9.4818     1.4305  -6.628 8.66e-11 ***
## Lot             1.1536     0.2579   4.473 9.51e-06 ***
## Highway       -37.4620    19.6072  -1.911   0.0566 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 63.41 on 509 degrees of freedom
## Multiple R-squared:  0.7922, Adjusted R-squared:  0.7877 
## F-statistic: 176.4 on 11 and 509 DF,  p-value: < 2.2e-16
# One-row tibble of model-level statistics (R-squared, AIC, BIC, ...) for
# the full model.
glance(realestate)
## # A tibble: 1 × 12
##   r.squared adj.r.squ…¹ sigma stati…²   p.value    df logLik   AIC   BIC devia…³
##       <dbl>       <dbl> <dbl>   <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>   <dbl>
## 1     0.792       0.788  63.4    176. 1.05e-165    11 -2895. 5816. 5872.  2.05e6
## # … with 2 more variables: df.residual <int>, nobs <int>, and abbreviated
## #   variable names ¹​adj.r.squared, ²​statistic, ³​deviance
# Distribution of living area (default 30 bins).
ggplot(data1, aes(x = SqFeet)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart: number of houses per bathroom count.
ggplot(data1, aes(x = Baths)) +
  geom_bar()

# Bar chart: number of houses per quality code.
ggplot(data1, aes(x = Quality)) +
  geom_bar()

# Bar chart: number of houses per garage size.
ggplot(data1, aes(x = Garage)) +
  geom_bar()

# Distribution of construction year (default 30 bins).
ggplot(data1, aes(x = Year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Multikolinearnost

# Scatter-plot matrix of all 12 columns with Pearson correlations, used to
# look for multicollinearity among the predictors.
pairs.panels(
  data1[, 1:12],
  method = "pearson",   # correlation method
  hist.col = "#00AFBB",
  density = TRUE,       # overlay density curves on the histograms
  ellipse = FALSE
)

Dijagnostički testovi

# Residuals against fitted values; the dashed red line marks zero residual.
ggplot(realestate_aug, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Fitted values", y = "Residuals")

# Q-Q plot of the residuals with a reference line.
p <- ggplot(realestate_aug, aes(sample = .resid))
p +
  stat_qq_point(size = 2) +
  stat_qq_line() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles")

# Histogram of the residuals in bins of width 20.
ggplot(realestate_aug, aes(x = .resid)) +
  geom_histogram(binwidth = 20) +
  labs(x = "Residuals")

# Residual-vs-leverage plot for the full model.
gg_resleverage(realestate)
## `geom_smooth()` using formula 'y ~ x'

# Cook's distance plot for the full model, used to spot influential
# observations.
gg_cooksd(realestate)

Predikcija za puni model

# 95% prediction interval for one new house from the full model.
# NOTE(review): AirCond is a 0/1 indicator in the data summary (min 0,
# max 1), so AirCond = 3 lies outside the observed range. Confirm whether
# 1 was intended; if so, the printed output must be regenerated.
predict(
  realestate,
  newdata = data.frame(
    Beds = 3, AirCond = 3, Style = 3, Year = 1998, SqFeet = 2, Baths = 3,
    Garage = 2, Highway = 0, Lot = 22, Quality = 2, Pool = 1
  ),
  interval = "prediction"
)
##       fit      lwr      upr
## 1 275.464 144.1638 406.7643

Predikcija za površinu, broj kupaonica i kvalitetu

# Prediction interval from the first reduced model (area, baths, quality).
predict(
  realestate2,
  newdata = data.frame(SqFeet = 2.3, Baths = 2, Quality = 2),
  interval = "prediction"
)
##        fit      lwr      upr
## 1 291.8885 153.8461 429.9308

Predikcija za broj kreveta, garažu i godinu izgradnje

# Prediction interval from the second reduced model (beds, garage, year).
predict(
  realestate3,
  newdata = data.frame(Beds = 3, Garage = 2, Year = 1998),
  interval = "prediction"
)
##       fit      lwr      upr
## 1 335.662 139.1182 532.2058

Nakon provođenja deskriptivne analize uočili smo logičke varijable Highway, AirCond i Pool. Kvantitativne varijable su SalesPrice, SqFeet, Beds, Baths, Garage, Year, Quality, Style i Lot, a kvalitativne varijable nisu uočene.

Napravili smo puni model sa svim varijablama, gdje predviđamo vrijednost varijable SalesPrice. Nakon toga na temelju analize multikolinearnosti odabrali smo varijable koje najviše utječu na SalesPrice i prema njima napravili dva reducirana modela.

Pomoću grafa reziduala i Cookovog dijagrama proveli smo dijagnostičke testove i zaključili da je opservacija 96 potencijalno stršilo. Smatramo da je to uzrokovano varijablama godina izgradnje (Year) i veličina dvorišta (Lot).

Prvi reducirani model je sadržavao varijable SqFeet, Quality i Baths, a drugi Beds, Garage i Year.

Logistička regresija

Deskriptivna analiza

# Per-column summary statistics of the credit-card data.
summary(data3)
##        ID         iznosKredita          spol        obrazovanje   
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :15000   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
##  3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##   bracnoStanje        dob          placanje1         placanje2      
##  Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
##  Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
##    placanje3         placanje4         placanje5         placanje6      
##  Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
##  Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
##     racuni1           racuni2          racuni3           racuni4       
##  Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
##  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
##  Median :  22382   Median : 21200   Median :  20089   Median :  19052  
##  Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
##  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
##  Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
##     racuni5          racuni6           placeno1         placeno2      
##  Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
##  1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
##  Median : 18105   Median :  17071   Median :  2100   Median :   2009  
##  Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
##  3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
##  Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
##     placeno3         placeno4         placeno5           placeno6       
##  Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
##  Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
##    kasnjenje     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2212  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
# Random ~80/20 train/test split of the credit-card data. The seed is fixed
# (as for the other two splits in this report) so that the split, the
# fitted models and every metric derived from them are reproducible.
set.seed(1234)
ind3 <- sample(
  c(TRUE, FALSE),
  nrow(data3),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
train3 <- data3[ind3, ]
test3 <- data3[!ind3, ]

# Full logistic regression of kasnjenje on every other column.
# NOTE(review): `.` also pulls in ID, which looks like a row identifier
# rather than a real predictor; consider `kasnjenje ~ . - ID`.
kreditne <- glm(kasnjenje ~ ., family = "binomial", data = train3)

# Reduced logistic regression on a hand-written subset of predictors.
# NOTE(review): if this formula is meant to mirror a stepwise-selected
# model, verify the variable list against that selection; a hard-coded
# list goes stale whenever the training split changes.
kreditne2 <- glm(
  kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + dob +
    placanje1 + placanje2 + placanje3 + placanje5 +
    racuni1 + racuni2 + racuni3 +
    placeno1 + placeno2 + placeno4 + placeno5,
  family = "binomial",
  data = train3
)
summary(kreditne)
## 
## Call:
## glm(formula = kasnjenje ~ ., family = "binomial", data = train3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1420  -0.7000  -0.5507  -0.2954   3.8378  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -6.762e-01  1.350e-01  -5.008 5.50e-07 ***
## ID           -4.574e-07  1.954e-06  -0.234 0.814901    
## iznosKredita -6.807e-07  1.746e-07  -3.898 9.69e-05 ***
## spol         -1.304e-01  3.436e-02  -3.794 0.000148 ***
## obrazovanje  -9.586e-02  2.348e-02  -4.082 4.46e-05 ***
## bracnoStanje -1.527e-01  3.541e-02  -4.312 1.61e-05 ***
## dob           7.801e-03  1.983e-03   3.935 8.34e-05 ***
## placanje1     5.728e-01  1.975e-02  29.008  < 2e-16 ***
## placanje2     9.034e-02  2.247e-02   4.020 5.81e-05 ***
## placanje3     7.283e-02  2.523e-02   2.887 0.003890 ** 
## placanje4     7.964e-03  2.799e-02   0.285 0.775969    
## placanje5     4.119e-02  3.006e-02   1.370 0.170636    
## placanje6     8.150e-03  2.472e-02   0.330 0.741657    
## racuni1      -4.800e-06  1.240e-06  -3.871 0.000108 ***
## racuni2       4.242e-07  1.682e-06   0.252 0.800945    
## racuni3       2.473e-06  1.460e-06   1.694 0.090296 .  
## racuni4       3.096e-07  1.468e-06   0.211 0.832974    
## racuni5      -7.079e-07  1.720e-06  -0.411 0.680717    
## racuni6       1.144e-06  1.388e-06   0.824 0.409864    
## placeno1     -1.126e-05  2.408e-06  -4.675 2.94e-06 ***
## placeno2     -1.015e-05  2.304e-06  -4.405 1.06e-05 ***
## placeno3     -1.470e-06  1.808e-06  -0.813 0.416095    
## placeno4     -3.352e-06  1.952e-06  -1.717 0.085994 .  
## placeno5     -3.203e-06  1.938e-06  -1.653 0.098254 .  
## placeno6     -3.755e-06  1.540e-06  -2.438 0.014759 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25224  on 23813  degrees of freedom
## Residual deviance: 22236  on 23789  degrees of freedom
## AIC: 22286
## 
## Number of Fisher Scoring iterations: 6
# One-row tibble of model-level statistics (deviance, AIC, BIC, ...) for
# the full logistic model.
glance(kreditne)
## # A tibble: 1 × 8
##   null.deviance df.null  logLik    AIC    BIC deviance df.residual  nobs
##           <dbl>   <int>   <dbl>  <dbl>  <dbl>    <dbl>       <int> <int>
## 1        25224.   23813 -11118. 22286. 22488.   22236.       23789 23814
# Distribution of the credit amount (default 30 bins).
ggplot(data3, aes(x = iznosKredita)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart: number of clients per sex code.
ggplot(data3, aes(x = spol)) +
  geom_bar()

# Bar chart: number of clients per education code.
ggplot(data3, aes(x = obrazovanje)) +
  geom_bar()

# Distribution of client age (default 30 bins).
ggplot(data3, aes(x = dob)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Predikcije

# Predicted probabilities from the full logistic model on the test set.
test3.predicted <- predict(kreditne, newdata = test3, type = "response")

# Confusion tables as proportions of all test rows (rounded to 3 decimals)
# for the 0.5 and 0.7 probability cut-offs.
kreditnatest05 <- round(
  prop.table(table(test3$kasnjenje, test3.predicted > 0.5)),
  3
)
kreditnatest07 <- round(
  prop.table(table(test3$kasnjenje, test3.predicted > 0.7)),
  3
)

# Raw counts: rows are the observed class, columns the predicted class.
table(test3$kasnjenje, test3.predicted > 0.5)
##    
##     FALSE TRUE
##   0  4719  121
##   1  1005  341
# Raw confusion counts for the stricter 0.7 cut-off (rows: observed class,
# columns: predicted class).
table(test3$kasnjenje, test3.predicted > 0.7)
##    
##     FALSE TRUE
##   0  4815   25
##   1  1287   59

StepAIC

# Stepwise selection by AIC starting from the full model; direction = "both"
# lets terms be dropped and re-added at every step.
stepkreditne <- stepAIC(kreditne, direction="both")
## Start:  AIC=22286.09
## kasnjenje ~ ID + iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje4 + placanje5 + 
##     placanje6 + racuni1 + racuni2 + racuni3 + racuni4 + racuni5 + 
##     racuni6 + placeno1 + placeno2 + placeno3 + placeno4 + placeno5 + 
##     placeno6
## 
##                Df Deviance   AIC
## - racuni4       1    22236 22284
## - ID            1    22236 22284
## - racuni2       1    22236 22284
## - placanje4     1    22236 22284
## - placanje6     1    22236 22284
## - racuni5       1    22236 22284
## - placeno3      1    22237 22285
## - racuni6       1    22237 22285
## - placanje5     1    22238 22286
## <none>               22236 22286
## - racuni3       1    22239 22287
## - placeno5      1    22239 22287
## - placeno4      1    22239 22287
## - placeno6      1    22243 22291
## - placanje3     1    22244 22292
## - spol          1    22250 22298
## - dob           1    22252 22300
## - iznosKredita  1    22252 22300
## - placanje2     1    22252 22300
## - obrazovanje   1    22253 22301
## - racuni1       1    22253 22301
## - bracnoStanje  1    22255 22303
## - placeno2      1    22260 22308
## - placeno1      1    22264 22312
## - placanje1     1    23075 23123
## 
## Step:  AIC=22284.14
## kasnjenje ~ ID + iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje4 + placanje5 + 
##     placanje6 + racuni1 + racuni2 + racuni3 + racuni5 + racuni6 + 
##     placeno1 + placeno2 + placeno3 + placeno4 + placeno5 + placeno6
## 
##                Df Deviance   AIC
## - ID            1    22236 22282
## - racuni2       1    22236 22282
## - placanje4     1    22236 22282
## - placanje6     1    22236 22282
## - racuni5       1    22236 22282
## - placeno3      1    22237 22283
## - racuni6       1    22237 22283
## - placanje5     1    22238 22284
## <none>               22236 22284
## - placeno5      1    22239 22285
## - racuni3       1    22240 22286
## + racuni4       1    22236 22286
## - placeno4      1    22240 22286
## - placeno6      1    22243 22289
## - placanje3     1    22244 22290
## - spol          1    22251 22297
## - dob           1    22252 22298
## - iznosKredita  1    22252 22298
## - placanje2     1    22252 22298
## - obrazovanje   1    22253 22299
## - racuni1       1    22253 22299
## - bracnoStanje  1    22255 22301
## - placeno2      1    22261 22307
## - placeno1      1    22264 22310
## - placanje1     1    23075 23121
## 
## Step:  AIC=22282.19
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje4 + placanje5 + 
##     placanje6 + racuni1 + racuni2 + racuni3 + racuni5 + racuni6 + 
##     placeno1 + placeno2 + placeno3 + placeno4 + placeno5 + placeno6
## 
##                Df Deviance   AIC
## - racuni2       1    22236 22280
## - placanje4     1    22236 22280
## - placanje6     1    22236 22280
## - racuni5       1    22236 22280
## - placeno3      1    22237 22281
## - racuni6       1    22237 22281
## - placanje5     1    22238 22282
## <none>               22236 22282
## - placeno5      1    22239 22283
## - racuni3       1    22240 22284
## + ID            1    22236 22284
## + racuni4       1    22236 22284
## - placeno4      1    22240 22284
## - placeno6      1    22243 22287
## - placanje3     1    22245 22289
## - spol          1    22251 22295
## - dob           1    22252 22296
## - iznosKredita  1    22252 22296
## - placanje2     1    22252 22296
## - obrazovanje   1    22253 22297
## - racuni1       1    22253 22297
## - bracnoStanje  1    22255 22299
## - placeno2      1    22261 22305
## - placeno1      1    22265 22309
## - placanje1     1    23075 23119
## 
## Step:  AIC=22280.25
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje4 + placanje5 + 
##     placanje6 + racuni1 + racuni3 + racuni5 + racuni6 + placeno1 + 
##     placeno2 + placeno3 + placeno4 + placeno5 + placeno6
## 
##                Df Deviance   AIC
## - placanje4     1    22236 22278
## - placanje6     1    22236 22278
## - racuni5       1    22236 22278
## - placeno3      1    22237 22279
## - racuni6       1    22237 22279
## - placanje5     1    22238 22280
## <none>               22236 22280
## - placeno5      1    22239 22281
## + racuni2       1    22236 22282
## + ID            1    22236 22282
## + racuni4       1    22236 22282
## - placeno4      1    22240 22282
## - racuni3       1    22242 22284
## - placeno6      1    22243 22285
## - placanje3     1    22245 22287
## - spol          1    22251 22293
## - dob           1    22252 22294
## - iznosKredita  1    22252 22294
## - placanje2     1    22252 22294
## - obrazovanje   1    22253 22295
## - bracnoStanje  1    22255 22297
## - placeno2      1    22266 22308
## - racuni1       1    22270 22312
## - placeno1      1    22271 22313
## - placanje1     1    23076 23118
## 
## Step:  AIC=22278.33
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje5 + placanje6 + 
##     racuni1 + racuni3 + racuni5 + racuni6 + placeno1 + placeno2 + 
##     placeno3 + placeno4 + placeno5 + placeno6
## 
##                Df Deviance   AIC
## - placanje6     1    22236 22276
## - racuni5       1    22237 22277
## - racuni6       1    22237 22277
## - placeno3      1    22237 22277
## <none>               22236 22278
## - placeno5      1    22239 22279
## - placanje5     1    22239 22279
## + placanje4     1    22236 22280
## + racuni2       1    22236 22280
## + ID            1    22236 22280
## + racuni4       1    22236 22280
## - placeno4      1    22241 22281
## - racuni3       1    22243 22283
## - placeno6      1    22243 22283
## - placanje3     1    22248 22288
## - spol          1    22251 22291
## - dob           1    22252 22292
## - iznosKredita  1    22252 22292
## - placanje2     1    22252 22292
## - obrazovanje   1    22253 22293
## - bracnoStanje  1    22255 22295
## - placeno2      1    22266 22306
## - racuni1       1    22270 22310
## - placeno1      1    22271 22311
## - placanje1     1    23082 23122
## 
## Step:  AIC=22276.44
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje5 + racuni1 + 
##     racuni3 + racuni5 + racuni6 + placeno1 + placeno2 + placeno3 + 
##     placeno4 + placeno5 + placeno6
## 
##                Df Deviance   AIC
## - racuni5       1    22237 22275
## - racuni6       1    22237 22275
## - placeno3      1    22237 22275
## <none>               22236 22276
## - placeno5      1    22240 22278
## + placanje6     1    22236 22278
## + placanje4     1    22236 22278
## + racuni2       1    22236 22278
## + ID            1    22236 22278
## + racuni4       1    22236 22278
## - placeno4      1    22241 22279
## - racuni3       1    22243 22281
## - placanje5     1    22243 22281
## - placeno6      1    22243 22281
## - placanje3     1    22248 22286
## - spol          1    22251 22289
## - dob           1    22252 22290
## - iznosKredita  1    22252 22290
## - placanje2     1    22253 22291
## - obrazovanje   1    22254 22292
## - bracnoStanje  1    22255 22293
## - placeno2      1    22266 22304
## - racuni1       1    22270 22308
## - placeno1      1    22271 22309
## - placanje1     1    23083 23121
## 
## Step:  AIC=22274.56
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje5 + racuni1 + 
##     racuni3 + racuni6 + placeno1 + placeno2 + placeno3 + placeno4 + 
##     placeno5 + placeno6
## 
##                Df Deviance   AIC
## - placeno3      1    22238 22274
## - racuni6       1    22238 22274
## <none>               22237 22275
## - placeno5      1    22240 22276
## + racuni5       1    22236 22276
## + placanje6     1    22237 22277
## + placanje4     1    22237 22277
## + racuni2       1    22237 22277
## + ID            1    22237 22277
## + racuni4       1    22237 22277
## - placeno4      1    22242 22278
## - racuni3       1    22243 22279
## - placanje5     1    22243 22279
## - placeno6      1    22243 22279
## - placanje3     1    22248 22284
## - spol          1    22251 22287
## - dob           1    22252 22288
## - iznosKredita  1    22253 22289
## - placanje2     1    22253 22289
## - obrazovanje   1    22254 22290
## - bracnoStanje  1    22255 22291
## - placeno2      1    22266 22302
## - racuni1       1    22271 22307
## - placeno1      1    22271 22307
## - placanje1     1    23083 23119
## 
## Step:  AIC=22273.5
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje5 + racuni1 + 
##     racuni3 + racuni6 + placeno1 + placeno2 + placeno4 + placeno5 + 
##     placeno6
## 
##                Df Deviance   AIC
## - racuni6       1    22238 22272
## <none>               22238 22274
## + placeno3      1    22237 22275
## - placeno5      1    22241 22275
## + racuni5       1    22237 22275
## + racuni4       1    22237 22275
## + placanje4     1    22237 22275
## + placanje6     1    22237 22275
## + ID            1    22237 22275
## + racuni2       1    22237 22275
## - placeno4      1    22243 22277
## - placanje5     1    22244 22278
## - placeno6      1    22245 22279
## - racuni3       1    22246 22280
## - placanje3     1    22249 22283
## - spol          1    22252 22286
## - dob           1    22253 22287
## - placanje2     1    22254 22288
## - iznosKredita  1    22254 22288
## - obrazovanje   1    22255 22289
## - bracnoStanje  1    22256 22290
## - placeno2      1    22270 22304
## - racuni1       1    22274 22308
## - placeno1      1    22275 22309
## - placanje1     1    23087 23121
## 
## Step:  AIC=22272.22
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje5 + racuni1 + 
##     racuni3 + placeno1 + placeno2 + placeno4 + placeno5 + placeno6
## 
##                Df Deviance   AIC
## <none>               22238 22272
## - placeno5      1    22241 22273
## + racuni6       1    22238 22274
## + placeno3      1    22238 22274
## + placanje6     1    22238 22274
## + racuni5       1    22238 22274
## + placanje4     1    22238 22274
## + racuni2       1    22238 22274
## + ID            1    22238 22274
## + racuni4       1    22238 22274
## - placeno4      1    22243 22275
## - placanje5     1    22245 22277
## - placeno6      1    22246 22278
## - placanje3     1    22250 22282
## - spol          1    22252 22284
## - racuni3       1    22253 22285
## - dob           1    22254 22286
## - placanje2     1    22254 22286
## - iznosKredita  1    22254 22286
## - obrazovanje   1    22256 22288
## - bracnoStanje  1    22257 22289
## - placeno2      1    22273 22305
## - racuni1       1    22275 22307
## - placeno1      1    22276 22308
## - placanje1     1    23090 23122
# Stepwise path: the term removed at each step with the resulting deviance
# and AIC, plus the initial and final model formulas.
stepkreditne$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## kasnjenje ~ ID + iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje4 + placanje5 + 
##     placanje6 + racuni1 + racuni2 + racuni3 + racuni4 + racuni5 + 
##     racuni6 + placeno1 + placeno2 + placeno3 + placeno4 + placeno5 + 
##     placeno6
## 
## Final Model:
## kasnjenje ~ iznosKredita + spol + obrazovanje + bracnoStanje + 
##     dob + placanje1 + placanje2 + placanje3 + placanje5 + racuni1 + 
##     racuni3 + placeno1 + placeno2 + placeno4 + placeno5 + placeno6
## 
## 
##          Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                               23789   22236.09 22286.09
## 2   - racuni4  1 0.04434569     23790   22236.14 22284.14
## 3        - ID  1 0.04698887     23791   22236.19 22282.19
## 4   - racuni2  1 0.06902413     23792   22236.25 22280.25
## 5 - placanje4  1 0.07548744     23793   22236.33 22278.33
## 6 - placanje6  1 0.11093016     23794   22236.44 22276.44
## 7   - racuni5  1 0.12142213     23795   22236.56 22274.56
## 8  - placeno3  1 0.94000079     23796   22237.50 22273.50
## 9   - racuni6  1 0.71718400     23797   22238.22 22272.22

Predikcije finalni model

# Score the test set with the model actually selected by stepAIC().
# stepAIC() returns the final fitted model, so predicting from
# `stepkreditne` keeps the results in sync with the stepwise selection.
# The hand-written `kreditne2` formula differs from the selected model
# (it contains racuni2 and lacks placeno6).
test3.predicted <- predict(stepkreditne, newdata = test3, type = "response")

# Confusion tables as proportions of all test rows (rounded to 3 decimals)
# for the 0.5 and 0.7 probability cut-offs.
kreditnatest05.2 <- table(test3$kasnjenje, test3.predicted > 0.5) |>
  prop.table() |>
  round(3)
kreditnatest07.2 <- table(test3$kasnjenje, test3.predicted > 0.7) |>
  prop.table() |>
  round(3)

# Raw counts: rows are the observed class, columns the predicted class.
table(test3$kasnjenje, test3.predicted > 0.5)
##    
##     FALSE TRUE
##   0  4719  121
##   1  1010  336
# Raw confusion counts of the reduced model for the 0.7 cut-off.
table(test3$kasnjenje, test3.predicted > 0.7)
##    
##     FALSE TRUE
##   0  4816   24
##   1  1286   60

Nakon izrade deskriptivne analize došli smo do zaključka da je varijabla kasnjenje jedina logička varijabla te nam je ta varijabla bitna za predikcije, s obzirom na to da istražujemo kreditne kartice. Po pitanju ostalih varijabli uočili smo da su spol, bracnoStanje i obrazovanje kvalitativne varijable. Sve ostale varijable su kvantitativne.

Točnost pre-stepAIC klasifikacije, izračunata iz gore prikazanih matrica konfuzije, bila je 81.80% za p = 0.5 ((4719 + 341) / 6186) i 78.79% za p = 0.7 ((4815 + 59) / 6186). Zaključili smo da je klasifikacija točnija za vrijednost p = 0.5.

Funkcijom stepAIC došli smo do zaključka da finalni prediktivni model treba sadržavati varijable iznosKredita, spol, obrazovanje, bracnoStanje, dob, placanje1, placanje2, placanje3, placanje5, racuni1, racuni3, placeno1, placeno2, placeno4, placeno5 i placeno6.

Točnost post-stepAIC klasifikacije, izračunata iz gore prikazanih matrica konfuzije, bila je 81.72% za p = 0.5 ((4719 + 336) / 6186) i 78.82% za p = 0.7 ((4816 + 60) / 6186). Zaključili smo da je klasifikacija točnija za vrijednost p = 0.5.

Točnost post-stepAIC modela vrlo je bliska točnosti punog modela iako sadrži 8 varijabli manje (16 umjesto 24).

Regresijsko stablo odlučivanja

# Reproducible ~70/30 split of the real-estate data: every row is labelled
# 1 (training) or 2 (test).
set.seed(1234)
ind <- sample(2, nrow(data1), replace = TRUE, prob = c(0.7, 0.3))
train <- data1[ind == 1, ]
test <- data1[ind == 2, ]

# Regression tree for SalesPrice on all other columns, then its diagram.
tree <- rpart(SalesPrice ~ ., data = train, method = "anova")
rpart.plot(tree)

# Complexity-parameter table with cross-validated error per tree size.
printcp(tree)
## 
## Regression tree:
## rpart(formula = SalesPrice ~ ., data = train, method = "anova")
## 
## Variables actually used in tree construction:
## [1] Baths   Lot     Quality SqFeet  Year   
## 
## Root node error: 6967815/363 = 19195
## 
## n= 363 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.555951      0   1.00000 1.01182 0.105439
## 2 0.153545      1   0.44405 0.45481 0.048247
## 3 0.045367      2   0.29050 0.35009 0.040930
## 4 0.030290      3   0.24514 0.26460 0.034845
## 5 0.025240      4   0.21485 0.26725 0.035304
## 6 0.011862      5   0.18961 0.24147 0.029276
## 7 0.010000      6   0.17775 0.24375 0.030209
# Plot of cross-validated error against the complexity parameter.
plotcp(tree)

Stablo koje smo vizualizirali ima 6 grananja (7 listova). Najmanju xerror vrijednost (0.24147) u ispisu funkcije printcp dobili smo u 6. retku, odnosno za stablo s 5 grananja.

U multiploj regresiji smo izračunali predikciju vrijednosti varijable SalesPrice i dobili rezultat 275.464, dok smo kroz regresijsko stablo odlučivanja dobili rezultat 244. Dobivena vrijednost je bila približna.

Klasifikacijsko stablo odlučivanja

# Fix the RNG seed so the random train/test split of the bean data further
# down is reproducible.
set.seed(1243)
# Per-column summary statistics; Class is a factor, so its level counts
# are listed.
summary(data2)
##       Area          Perimeter      MajorAxisLength MinorAxisLength
##  Min.   : 20420   Min.   : 524.7   Min.   :183.6   Min.   :122.5  
##  1st Qu.: 36328   1st Qu.: 703.5   1st Qu.:253.3   1st Qu.:175.8  
##  Median : 44652   Median : 794.9   Median :296.9   Median :192.4  
##  Mean   : 53048   Mean   : 855.3   Mean   :320.1   Mean   :202.3  
##  3rd Qu.: 61332   3rd Qu.: 977.2   3rd Qu.:376.5   3rd Qu.:217.0  
##  Max.   :254616   Max.   :1985.4   Max.   :738.9   Max.   :460.2  
##                                                                   
##   AspectRation    Eccentricity      ConvexArea     EquivDiameter  
##  Min.   :1.025   Min.   :0.2190   Min.   : 20684   Min.   :161.2  
##  1st Qu.:1.432   1st Qu.:0.7159   1st Qu.: 36715   1st Qu.:215.1  
##  Median :1.551   Median :0.7644   Median : 45178   Median :238.4  
##  Mean   :1.583   Mean   :0.7509   Mean   : 53768   Mean   :253.1  
##  3rd Qu.:1.707   3rd Qu.:0.8105   3rd Qu.: 62294   3rd Qu.:279.4  
##  Max.   :2.430   Max.   :0.9114   Max.   :263261   Max.   :569.4  
##                                                                   
##      Extent          Solidity        roundness       Compactness    
##  Min.   :0.5553   Min.   :0.9192   Min.   :0.4896   Min.   :0.6406  
##  1st Qu.:0.7186   1st Qu.:0.9857   1st Qu.:0.8321   1st Qu.:0.7625  
##  Median :0.7599   Median :0.9883   Median :0.8832   Median :0.8013  
##  Mean   :0.7497   Mean   :0.9871   Mean   :0.8733   Mean   :0.7999  
##  3rd Qu.:0.7869   3rd Qu.:0.9900   3rd Qu.:0.9169   3rd Qu.:0.8343  
##  Max.   :0.8662   Max.   :0.9947   Max.   :0.9907   Max.   :0.9873  
##                                                                     
##   ShapeFactor1       ShapeFactor2        ShapeFactor3     ShapeFactor4   
##  Min.   :0.002778   Min.   :0.0005642   Min.   :0.4103   Min.   :0.9477  
##  1st Qu.:0.005900   1st Qu.:0.0011535   1st Qu.:0.5814   1st Qu.:0.9937  
##  Median :0.006645   Median :0.0016935   Median :0.6420   Median :0.9964  
##  Mean   :0.006564   Mean   :0.0017159   Mean   :0.6436   Mean   :0.9951  
##  3rd Qu.:0.007271   3rd Qu.:0.0021703   3rd Qu.:0.6960   3rd Qu.:0.9979  
##  Max.   :0.010451   Max.   :0.0036650   Max.   :0.9748   Max.   :0.9997  
##                                                                          
##       Class     
##  BARBUNYA:1322  
##  BOMBAY  : 522  
##  CALI    :1630  
##  DERMASON:3546  
##  HOROZ   :1928  
##  SEKER   :2027  
##  SIRA    :2636
# Bar chart: number of beans in each class.
ggplot(data2, aes(x = Class)) +
  geom_bar()

# Random ~80/20 train/test split of the bean data.
ind2 <- sample(
  c(TRUE, FALSE),
  nrow(data2),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
train2 <- data2[ind2, ]
test2 <- data2[!ind2, ]

# Classification tree for Class on all other columns; one box colour per
# class in the diagram.
tree2 <- rpart(Class ~ ., data = train2, method = "class")
rpart.plot(tree2, box.palette = as.list(rainbow(7)))

# Complexity-parameter table with cross-validated error per tree size.
printcp(tree2)
## 
## Classification tree:
## rpart(formula = Class ~ ., data = train2, method = "class")
## 
## Variables actually used in tree construction:
## [1] Compactness     MajorAxisLength MinorAxisLength Perimeter      
## [5] roundness       ShapeFactor1   
## 
## Root node error: 7998/10834 = 0.73823
## 
## n= 10834 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.197049      0   1.00000 1.00000 0.0057209
## 2 0.171793      1   0.80295 0.80345 0.0063932
## 3 0.163166      2   0.63116 0.62041 0.0064840
## 4 0.146537      3   0.46799 0.47099 0.0061978
## 5 0.065141      4   0.32146 0.32471 0.0055558
## 6 0.018630      6   0.19117 0.19605 0.0045787
## 7 0.010628      7   0.17254 0.17767 0.0043932
## 8 0.010000      8   0.16192 0.17392 0.0043535
# Plot of cross-validated error against the complexity parameter.
plotcp(tree2)

# Predicted classes for the held-out beans, compared with the true classes.
predikcije <- predict(tree2, newdata = test2, type = "class")
confusionMatrix(data = predikcije, reference = test2$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
##   BARBUNYA      212      0   35        0     4     2    1
##   BOMBAY          0    116    0        0     0     0    0
##   CALI           19      0  281        0    13     0    3
##   DERMASON        0      0    0      649     2    19   74
##   HOROZ           3      0   19        0   340     0    0
##   SEKER           1      0    0        9     0   364    6
##   SIRA            5      0    2       52    18    45  483
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8804          
##                  95% CI : (0.8678, 0.8923)
##     No Information Rate : 0.2557          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.855           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: BARBUNYA Class: BOMBAY Class: CALI Class: DERMASON
## Sensitivity                  0.88333       1.00000      0.8338          0.9141
## Specificity                  0.98345       1.00000      0.9857          0.9540
## Pos Pred Value               0.83465       1.00000      0.8892          0.8723
## Neg Pred Value               0.98890       1.00000      0.9772          0.9700
## Prevalence                   0.08642       0.04177      0.1214          0.2557
## Detection Rate               0.07634       0.04177      0.1012          0.2337
## Detection Prevalence         0.09147       0.04177      0.1138          0.2679
## Balanced Accuracy            0.93339       1.00000      0.9097          0.9341
##                      Class: HOROZ Class: SEKER Class: SIRA
## Sensitivity                0.9019       0.8465      0.8519
## Specificity                0.9908       0.9932      0.9448
## Pos Pred Value             0.9392       0.9579      0.7983
## Neg Pred Value             0.9847       0.9725      0.9613
## Prevalence                 0.1358       0.1548      0.2042
## Detection Rate             0.1224       0.1311      0.1739
## Detection Prevalence       0.1304       0.1368      0.2179
## Balanced Accuracy          0.9463       0.9198      0.8983
# Set the `warn` option to the value held in `defaultW`. `defaultW` is
# defined outside this excerpt; presumably it is the original
# getOption("warn") saved before warnings were silenced (TODO confirm).
options(warn = defaultW)

Varijabla Class je jedina kvalitativna varijabla, a sve ostale varijable su kvantitativne.

Stablo koje smo vizualizirali ima 9 listova (završnih čvorova), odnosno 8 grananja.

U konfuzijskoj matrici smo uočili da je klasa Bombay u potpunosti točno klasificirana. Klasa Sira ima najnižu uravnoteženu točnost (0.8983) i najnižu pozitivnu prediktivnu vrijednost (0.7983): 14.81% Sira grahova je krivo klasificirano, a 20.17% predikcija klase Sira je pogrešno.